from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
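    # Note: setMaster("yarn") assumes the client machine can reach a YARN
    # cluster (e.g. HADOOP_CONF_DIR configured); for local testing,
    # "local[*]" is a common alternative master URL.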
    conf = SparkConf().setAppName("PyCharmWordCount").setMaster("yarn")
    # Build the SparkContext from the SparkConf object
    sc = SparkContext(conf=conf)
    # Read the input file through the SparkContext
    fileRdd = sc.textFile("hdfs://node1:8020/words.txt")
    # Split each line of the file into words on spaces
    wordsRdd = fileRdd.flatMap(lambda line: line.split(" "))
    # Map each word to a (word, 1) pair
    wordPairRdd = wordsRdd.map(lambda x: (x, 1))
    # Group the pairs by key and sum the values to count each word
    resultRdd = wordPairRdd.reduceByKey(lambda a, b: a + b)
    # Collect the results to the Driver and print them
    print(resultRdd.collect())
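    # collect() pulls the entire result set into the Driver; for large
    # inputs, writing out with resultRdd.saveAsTextFile(...) is the safer choice.
    # Stop the SparkContext to release cluster resources.
    sc.stop()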
